import os, base64, re, logging
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
# sankey charts
import floweaver
import csv
from pprint import pprint
# map
import folium
# process issue and solution text
from nltk import RegexpParser, sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import download as nltk_download
from string import punctuation
# optional
#from functools import reduce
from itertools import chain
# gocode location
import requests
# api ip and passwords
from helpers import config
# Load the Restart Project device-repair log; 'Date' is parsed to datetime64.
restart_df = pd.read_csv(
    './data/devices_2.csv',
    header=0,
    parse_dates=['Date'],
    encoding='utf-8',
)
# Notebook-style peek at the first rows (no effect when run as a script).
restart_df.head()
# Record count per product category (counts non-null 'Brand' entries per group).
Categories = (
    restart_df.groupby('Product Category')['Brand']
    .count()
    .reset_index(name='Records_Count')
)
Categories
# Record count per repair outcome (counts non-null 'Brand' entries per group).
Repair_Status = (
    restart_df.groupby('Repair Status')['Brand']
    .count()
    .reset_index(name='Records_Count')
)
Repair_Status
# Record count per event; show only the first ten rows.
Event_Location = (
    restart_df.groupby('Event')['Brand']
    .count()
    .reset_index(name='Records_Count')
)
Event_Location[0:10]
# Record count per spare-parts status.
Spare_parts = (
    restart_df.groupby('Spare parts (needed/used)')['Brand']
    .count()
    .reset_index(name='Records_Count')
)
Spare_parts
# Record count per Restart group; the group names feed the geocoder below.
Restart_Group = (
    restart_df.groupby('Group')['Brand']
    .count()
    .reset_index(name='Records_Count')
)
Restart_Group.head()
# Unique group names to geocode (one lookup per group, not per record).
locations = Restart_Group['Group'].values
locations[0:4]
# Geocode each Restart group name via the Google Maps Geocoding API.
# (Loop/if bodies were un-indented in the original — invalid Python — and the
# HTTP call had no timeout, so a stalled connection could hang forever.)
GOOGLE_MAPS_API_URL = config.GOOGLE_MAPS_API_URL
api_key = config.api_key

geo_location = {}
for loc in locations:
    params = {
        'address': loc,
        'key': api_key,
    }
    req = requests.get(GOOGLE_MAPS_API_URL, params=params, timeout=10)
    res = req.json()
    if res['status'] == 'ZERO_RESULTS':
        # put unknown into atlantic ocean
        # NOTE(review): (lat 14.5994, lng 28.6731) is in Sudan, not the
        # Atlantic — lat/lng may have been swapped/negated; confirm intent.
        geodata = [14.5994, 28.6731]
    else:
        # Take the first (best) match only.
        result = res['results'][0]
        geodata = [
            result['geometry']['location']['lat'],
            result['geometry']['location']['lng'],
        ]
    geo_location[loc] = geodata

# Spot-check one known group.
geo_location['Brixton Remakery']
# save to disc here to avoid having to call api all the time
# Drop the free-text spares column; it is not used by any chart below.
del restart_df['Spare parts (needed/used)']

# fillna is the direct idiom for NaN replacement; the original used
# replace(np.nan, ..., regex=True), which misuses the regex path for a
# non-string pattern. Result is identical: NaN -> placeholder string.
restart_df['Brand'] = restart_df['Brand'].fillna('Unknown')
restart_df['Comments'] = restart_df['Comments'].fillna('None')
restart_df['Event'] = restart_df['Event'].fillna('Unknown')
restart_df['Group'] = restart_df['Group'].fillna('Unknown')

# Attach the [lat, lng] pair geocoded earlier for each group
# (every 'Group' value has an entry — NaNs were just filled with 'Unknown',
# which must therefore also exist in geo_location).
restart_df['Location_Coordinates'] = restart_df['Group'].apply(lambda x: geo_location[x])

restart_df.columns
# Shorter, underscore-safe names used by the chart code below.
restart_df.columns = ['Category', 'Brand', 'Comments', 'Repair_Status', 'Event_Location', 'Restart_Group', 'Event_Date', 'Location_Coordinates']
restart_df.head()
# graph datasets
# Flow counts Category -> Brand for the six tracked device types.
# Series.isin replaces the original chain of six '==' comparisons or-ed together.
device_types = ['Desktop computer', 'Laptop large', 'Laptop medium',
                'Laptop small', 'Mobile', 'Tablet']
Category_Brand = (
    restart_df.loc[restart_df['Category'].isin(device_types),
                   ['Category', 'Brand', 'Event_Date']]
    .groupby(['Category', 'Brand'])
    .count()
    .reset_index()
)
# floweaver expects source/target/value columns.
Category_Brand.columns = ['source', 'target', 'value']
Category_Brand.head()
# Flow counts Category -> Repair_Status for the six tracked device types.
# Series.isin replaces the original chain of six '==' comparisons or-ed together.
device_types = ['Desktop computer', 'Laptop large', 'Laptop medium',
                'Laptop small', 'Mobile', 'Tablet']
Category_Repair_Status = (
    restart_df.loc[restart_df['Category'].isin(device_types),
                   ['Category', 'Repair_Status', 'Event_Date']]
    .groupby(['Category', 'Repair_Status'])
    .count()
    .reset_index()
)
# floweaver expects source/target/value columns.
Category_Repair_Status.columns = ['source', 'target', 'value']
# NOTE: flows_df1 is an alias (not a copy), so the 'type' column added here
# also appears on Category_Repair_Status — preserved from the original.
flows_df1 = Category_Repair_Status
flows_df1['type'] = flows_df1['source']
Category_Repair_Status.head()
# Flow counts Brand -> Repair_Status for the six tracked device types.
# Series.isin replaces the original chain of six '==' comparisons or-ed together.
device_types = ['Desktop computer', 'Laptop large', 'Laptop medium',
                'Laptop small', 'Mobile', 'Tablet']
Brand_Repair_Status = (
    restart_df.loc[restart_df['Category'].isin(device_types),
                   ['Brand', 'Repair_Status', 'Event_Date']]
    .groupby(['Brand', 'Repair_Status'])
    .count()
    .reset_index()
)
# floweaver expects source/target/value columns.
Brand_Repair_Status.columns = ['source', 'target', 'value']
flows_df2 = Brand_Repair_Status
Brand_Repair_Status.head()
# Two-stage flow table: Category -> Brand plus Brand -> Repair_Status.
flows_df3 = pd.concat([Category_Brand, Brand_Repair_Status])
flows_df3.head()
#https://sankeyview.readthedocs.io/en/latest/tutorials/quickstart.html
# Sankey diagram: device categories (left) flowing into repair outcomes (right).
size = dict(width=800, height=420)

device_types = ['Desktop computer', 'Laptop large', 'Laptop medium',
                'Laptop small', 'Mobile', 'Tablet']
repair_states = ['End of life', 'Fixed', 'Repairable', 'Unknown']

nodes = {
    'product_categories': floweaver.ProcessGroup(list(device_types)),
    'repair_status': floweaver.ProcessGroup(list(repair_states)),
}
# Categories on the left, repair outcomes on the right.
ordering = [
    ['product_categories'],
    ['repair_status'],
]
bundles = [floweaver.Bundle('product_categories', 'repair_status')]

# Colour flows by the 'type' column (a copy of the source category in flows_df1).
products_by_Category = floweaver.Partition.Simple('type', list(device_types))
palette = {'Laptop medium': 'yellowgreen', 'Laptop large': 'green',
           'Laptop small': 'gold', 'Tablet': 'orange', 'Mobile': 'red',
           'Desktop computer': 'blue'}

# Partition each ProcessGroup so every process is drawn as its own node slice.
nodes['product_categories'].partition = floweaver.Partition.Simple('process', list(device_types))
nodes['repair_status'].partition = floweaver.Partition.Simple('process', list(repair_states))

sdd = floweaver.SankeyDefinition(nodes, bundles, ordering,
                                 flow_partition=products_by_Category)
floweaver.weave(sdd, flows_df1, palette=palette).to_widget(**size).auto_save_png('categiries_repair_status2.png')
from folium.plugins import MarkerCluster

# Clustered map of all repair records at their geocoded group coordinates.
# (The loop body was un-indented in the original — invalid Python as written.)
restart_map = folium.Map(location=[51.5, -0.1], zoom_start=11)  # centred on London
marker_cluster = MarkerCluster().add_to(restart_map)

for ix, point in enumerate(restart_df['Location_Coordinates'].values):
    folium.Marker(
        point,
        popup=('Restart Group: ' + restart_df['Restart_Group'].iloc[ix]
               + "\n Location: " + restart_df['Event_Location'].iloc[ix]),
    ).add_to(marker_cluster)

restart_map